{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "iris = datasets.load_iris()\n",
    "x = iris.data\n",
    "y = iris.target\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of KNClassifier: 0.9555555555555556\n"
     ]
    }
   ],
   "source": [
    "### 使用模块内置的分类器\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "clf = KNeighborsClassifier()\n",
    "clf.fit(x_train, y_train)\n",
    "pred = clf.predict(x_test)\n",
    "\n",
    "acc = accuracy_score(pred, y_test)\n",
    "\n",
    "print('Accuracy of KNClassifier:', acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Python实现KNN分类器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of MyClassifier: 0.9777777777777777\n"
     ]
    }
   ],
   "source": [
    "##  自写分类器，被测样本点，和它距离最近的点为同一类\n",
    "from scipy.spatial import distance\n",
    "\n",
    "\n",
    "def euc(a, b):\n",
    "    # 欧式距离\n",
    "    return distance.euclidean(a, b)\n",
    "\n",
    "\n",
    "class ScrappyKNN():\n",
    "\n",
    "    # fit接口\n",
    "    def fit(self, x_train, y_train):\n",
    "        self.x_train = x_train\n",
    "        self.y_train = y_train\n",
    "\n",
    "    # predict接口\n",
    "    def predict(self, x_test):\n",
    "        self.x_test = x_test\n",
    "        predictions = []\n",
    "        for row in x_test:\n",
    "            label = self.closest(row)\n",
    "            predictions.append(label)\n",
    "        return (predictions)\n",
    "\n",
    "    def closest(self, row):\n",
    "        # x_train中与row最近的点，与row同一类别\n",
    "        best_dist = euc(row, self.x_train[0])\n",
    "        best_index = 0\n",
    "        for i in range(1, len(x_train)):\n",
    "            dist = euc(row, self.x_train[i])\n",
    "            if dist < best_dist:\n",
    "                best_dist = dist\n",
    "                best_index = i\n",
    "        return (self.y_train[best_index])\n",
    "\n",
    "\n",
    "clf = ScrappyKNN()\n",
    "clf.fit(x_train, y_train)\n",
    "pred = clf.predict(x_test)\n",
    "acc = accuracy_score(pred, y_test)\n",
    "print('Accuracy of MyClassifier:', acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Numpy实现的KNN分类器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def euclidean_distance(x1, x2):\n",
    "    # distance = 0\n",
    "    # for i in range(len(x1)):\n",
    "    #     distance += pow(x1[i] - x2[i], 2)\n",
    "    # return math.sqrt(distance)\n",
    "    return np.sqrt(np.sum((x1 - x2)**2))\n",
    "\n",
    "\n",
    "class KNN():\n",
    "    def __init__(self, k):\n",
    "        self.k = k\n",
    "\n",
    "    def _vote(self, neighbor_labels):\n",
    "        counts = np.bincount(neighbor_labels.astype('int'))\n",
    "        return counts.argmax()\n",
    "\n",
    "    def predict(self, X_test, X_train, y_train):\n",
    "        y_pred = np.empty(X_test.shape[0])\n",
    "        for i, test_sample in enumerate(X_test):\n",
    "            idx = np.argsort(\n",
    "                [euclidean_distance(test_sample, x) for x in X_train])[:self.k]\n",
    "            k_nearest_heighbors = np.array([y_train[i] for i in idx])\n",
    "            y_pred[i] = self._vote(k_nearest_heighbors)\n",
    "\n",
    "        return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of My_KNNeighbourClassifier: 0.9555555555555556\n"
     ]
    }
   ],
   "source": [
    "# 自写分类器，被测点与和它最近的k个点为同一类\n",
    "import numpy\n",
    "\n",
    "\n",
    "class KNearestNeighbours:\n",
    "    def __init__(self, k=5):\n",
    "        self.k = k\n",
    "\n",
    "    # fit接口\n",
    "    def fit(self, x_train, y_train):\n",
    "        self.x = x_train\n",
    "        self.y = y_train\n",
    "\n",
    "    # predict接口\n",
    "    def predict(self, x_test):\n",
    "        def EuclidDist(testPoint, checkPint):\n",
    "            distance = numpy.linalg.norm(testPoint - checkPint)\n",
    "            return (distance)\n",
    "\n",
    "        def closest(testPoint):\n",
    "            distArray = numpy.array(\n",
    "                [(EuclidDist(testPoint, self.x[i]), self.y[i])\n",
    "                 for i in range(len(self.x))],\n",
    "                dtype=[('dist', float), ('lab', int)])\n",
    "            distArray.sort(order='dist')\n",
    "            majority = {}\n",
    "            for j in range(self.k):\n",
    "                if majority.get(distArray[j][1]) == None:\n",
    "                    majority[distArray[j][1]] = 0\n",
    "                else:\n",
    "                    majority[distArray[j][1]] += 1\n",
    "            return (max(majority, key=majority.get))\n",
    "\n",
    "        if x_test.ndim == 1:\n",
    "            return (closest(x_test))\n",
    "        else:\n",
    "            predictions = numpy.array([closest(point) for point in x_test])\n",
    "            return (predictions)\n",
    "\n",
    "\n",
    "clf = KNearestNeighbours(k=5)\n",
    "clf.fit(x_train, y_train)\n",
    "pred = clf.predict(x_test)\n",
    "acc = accuracy_score(pred, y_test)\n",
    "print('Accuracy of My_KNNeighbourClassifier:', acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### python实现决策树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 自写决策树.类似上述fit和predict接口！！！！！！！！！！！！！！！！！！！\n",
    "\n",
    "# 原始数据\n",
    "training_data = [\n",
    "    ['Green', 3, 'Apple'],\n",
    "    ['Yellow', 3, 'Apple'],\n",
    "    ['Red', 1, 'Grape'],\n",
    "    ['Red', 1, 'Grape'],\n",
    "    ['Yellow', 3, 'Lemon'],\n",
    "]\n",
    "header = [\"color\", \"diameter\", \"label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Green', 'Red', 'Yellow'}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除列中特征值重复的值\n",
    "\n",
    "\n",
    "def unique_vals(rows, col):\n",
    "    return (set([row[col] for row in rows]))\n",
    "\n",
    "\n",
    "unique_vals(training_data, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Apple': 2, 'Grape': 2, 'Lemon': 1}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 获得训练数据中每一类的样本数\n",
    "\n",
    "\n",
    "def class_counts(rows):\n",
    "    counts = {}\n",
    "    for row in rows:\n",
    "        label = row[-1]\n",
    "        if label not in counts:\n",
    "            counts[label] = 0\n",
    "        counts[label] += 1\n",
    "    return (counts)\n",
    "\n",
    "\n",
    "class_counts(training_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 原始数据中特征值有文本有数字，做出判断\n",
    "def is_numeric(value):\n",
    "    return (isinstance(value, int) or isinstance(value, float))\n",
    "\n",
    "\n",
    "is_numeric(training_data[0][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Is color == Green?\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 用来分类的问题\n",
    "\n",
    "\n",
    "class Question:\n",
    "\n",
    "    #对每一列按value值做出判断\n",
    "    def __init__(self, column, value):\n",
    "        self.column = column\n",
    "        self.value = value\n",
    "\n",
    "    def match(self, example):\n",
    "        val = example[self.column]\n",
    "        if is_numeric(val):\n",
    "            return (val >= self.value)\n",
    "        else:\n",
    "            return (val == self.value)\n",
    "\n",
    "    def __repr__(self):\n",
    "        condition = '=='\n",
    "        if is_numeric(self.value):\n",
    "            condition = '>='\n",
    "        return ('Is %s %s %s?' %\n",
    "                (header[self.column], condition, str(self.value)))\n",
    "\n",
    "\n",
    "q = Question(0, 'Green')\n",
    "print(q)\n",
    "q.match(training_data[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['Red', 1, 'Grape'], ['Red', 1, 'Grape']] [['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]\n",
      "[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']] [['Red', 1, 'Grape'], ['Red', 1, 'Grape']]\n"
     ]
    }
   ],
   "source": [
    "# 用问题对数据分组\n",
    "\n",
    "\n",
    "def partition(rows, question):\n",
    "    true_rows = []\n",
    "    false_rows = []\n",
    "    for row in rows:\n",
    "        if question.match(row):\n",
    "            true_rows.append(row)\n",
    "        else:\n",
    "            false_rows.append(row)\n",
    "    return (true_rows, false_rows)\n",
    "\n",
    "\n",
    "true_rows, false_rows = partition(training_data, Question(0, 'Red'))\n",
    "print(true_rows, false_rows)\n",
    "\n",
    "true_rows, false_rows = partition(training_data, Question(1, 2))\n",
    "print(true_rows, false_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6399999999999999"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 判断分类效果的参数\n",
    "\n",
    "\n",
    "def gini(rows):\n",
    "    counts = class_counts(rows)\n",
    "    impurity = 1\n",
    "    for label in counts:\n",
    "        prob_of_label = counts[label] / float(len(rows))\n",
    "        impurity -= prob_of_label**2\n",
    "    return (impurity)\n",
    "\n",
    "\n",
    "gini(training_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# information gain\n",
    "# decision tree algorithm:maximize information gain\n",
    "\n",
    "\n",
    "def info_gain(left, right, current_uncertainty):\n",
    "    p = float(len(left)) / (len(left) + len(right))\n",
    "    return (current_uncertainty - p * gini(left) - (1 - p) * gini(right))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_best_split(rows):\n",
    "    best_gain = 0\n",
    "    best_question = None\n",
    "    current_uncertainty = gini(rows)\n",
    "    n_features = len(rows[0]) - 1\n",
    "\n",
    "    for col in range(n_features):\n",
    "        values = set([row[col] for row in rows])\n",
    "        for val in values:\n",
    "            question = Question(col, val)\n",
    "            true_rows, false_rows = partition(rows, question)\n",
    "            if len(true_rows) == 0 or len(false_rows) == 0:\n",
    "                continue\n",
    "            gain = info_gain(true_rows, false_rows, current_uncertainty)\n",
    "            if gain >= best_gain:\n",
    "                best_gain, best_question = gain, question\n",
    "    return (best_gain, best_question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Leaf:\n",
    "    def __init__(self, rows):\n",
    "        self.predictions = class_counts(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Decision_Node:\n",
    "    def __init__(self, question, true_branch, false_branch):\n",
    "        self.question = question\n",
    "        self.true_branch = true_branch\n",
    "        self.false_branch = false_branch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_tree(rows):\n",
    "    gain, question = find_best_split(rows)\n",
    "    if gain == 0:\n",
    "        return (Leaf(rows))\n",
    "    true_rows, false_rows = partition(rows, question)\n",
    "    true_branch = build_tree(true_rows)\n",
    "    false_branch = build_tree(false_rows)\n",
    "    return (Decision_Node(question, true_branch, false_branch))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_tree(node, spacing=\"\"):\n",
    "    if isinstance(node, Leaf):\n",
    "        print(spacing + \"predict\", node.predictions)\n",
    "        return\n",
    "    print(spacing + str(node.question))\n",
    "\n",
    "    print(spacing + '-->True')\n",
    "    print_tree(node.true_branch, spacing + ' ')\n",
    "\n",
    "    print(spacing + '--> False:')\n",
    "    print_tree(node.false_branch, spacing + \"  \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_tree=build_tree(training_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Is diameter >= 3?\n",
      "-->True\n",
      " Is color == Yellow?\n",
      " -->True\n",
      "  predict {'Apple': 1, 'Lemon': 1}\n",
      " --> False:\n",
      "   predict {'Apple': 1}\n",
      "--> False:\n",
      "  predict {'Grape': 2}\n"
     ]
    }
   ],
   "source": [
    "print_tree(my_tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
